{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "origin_pos": 0
   },
   "source": [
    "## 03自动并行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T07:11:59.505418Z",
     "iopub.status.busy": "2023-08-18T07:11:59.504686Z",
     "iopub.status.idle": "2023-08-18T07:12:02.958789Z",
     "shell.execute_reply": "2023-08-18T07:12:02.957933Z"
    },
    "origin_pos": 2,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "from d2l import torch as d2l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T07:12:02.987012Z",
     "iopub.status.busy": "2023-08-18T07:12:02.986327Z",
     "iopub.status.idle": "2023-08-18T07:12:05.221346Z",
     "shell.execute_reply": "2023-08-18T07:12:05.220262Z"
    },
    "origin_pos": 6,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [],
   "source": [
    "devices = d2l.try_all_gpus()\n",
    "def run(x):\n",
    "    return [x.mm(x) for _ in range(50)]\n",
    "\n",
    "x_gpu1 = torch.rand(size=(4000, 4000), device=devices[0])\n",
    "x_gpu2 = torch.rand(size=(4000, 4000), device=devices[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T07:12:05.225646Z",
     "iopub.status.busy": "2023-08-18T07:12:05.224864Z",
     "iopub.status.idle": "2023-08-18T07:12:07.664593Z",
     "shell.execute_reply": "2023-08-18T07:12:07.663740Z"
    },
    "origin_pos": 12,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GPU1 time: 0.4600 sec\n",
      "GPU2 time: 0.4706 sec\n"
     ]
    }
   ],
   "source": [
    "run(x_gpu1)\n",
    "run(x_gpu2)  # 预热设备\n",
    "torch.cuda.synchronize(devices[0])\n",
    "torch.cuda.synchronize(devices[1])\n",
    "\n",
    "with d2l.Benchmark('GPU1 time'):\n",
    "    run(x_gpu1)\n",
    "    torch.cuda.synchronize(devices[0])\n",
    "\n",
    "with d2l.Benchmark('GPU2 time'):\n",
    "    run(x_gpu2)\n",
    "    torch.cuda.synchronize(devices[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T07:12:07.668313Z",
     "iopub.status.busy": "2023-08-18T07:12:07.667763Z",
     "iopub.status.idle": "2023-08-18T07:12:08.130167Z",
     "shell.execute_reply": "2023-08-18T07:12:08.129377Z"
    },
    "origin_pos": 18,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GPU1 & GPU2: 0.4580 sec\n"
     ]
    }
   ],
   "source": [
    "with d2l.Benchmark('GPU1 & GPU2'):\n",
    "    run(x_gpu1)\n",
    "    run(x_gpu2)\n",
    "    torch.cuda.synchronize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T07:12:08.133753Z",
     "iopub.status.busy": "2023-08-18T07:12:08.133184Z",
     "iopub.status.idle": "2023-08-18T07:12:10.950227Z",
     "shell.execute_reply": "2023-08-18T07:12:10.949308Z"
    },
    "origin_pos": 22,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "在GPU1上运行: 0.4608 sec\n",
      "复制到CPU: 2.3504 sec\n"
     ]
    }
   ],
   "source": [
    "def copy_to_cpu(x, non_blocking=False):\n",
    "    return [y.to('cpu', non_blocking=non_blocking) for y in x]\n",
    "\n",
    "with d2l.Benchmark('在GPU1上运行'):\n",
    "    y = run(x_gpu1)\n",
    "    torch.cuda.synchronize()\n",
    "\n",
    "with d2l.Benchmark('复制到CPU'):\n",
    "    y_cpu = copy_to_cpu(y)\n",
    "    torch.cuda.synchronize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-18T07:12:10.954084Z",
     "iopub.status.busy": "2023-08-18T07:12:10.953336Z",
     "iopub.status.idle": "2023-08-18T07:12:12.728692Z",
     "shell.execute_reply": "2023-08-18T07:12:12.727837Z"
    },
    "origin_pos": 28,
    "tab": [
     "pytorch"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "在GPU1上运行并复制到CPU: 1.7703 sec\n"
     ]
    }
   ],
   "source": [
    "with d2l.Benchmark('在GPU1上运行并复制到CPU'):\n",
    "    y = run(x_gpu1)\n",
    "    y_cpu = copy_to_cpu(y, True)\n",
    "    torch.cuda.synchronize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "required_libs": []
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
